
from init import PATHS
import logging
LOGGER = logging.getLOGGER(__name__)
import pandas as pd
import os
import numpy as np

"""
Calculate the nbr of citations that a family receives within X number of years.
PATSTAT Table 228 provides info about which family cites which other family 
by adding the earliest filing year of these families, we can calculate the time until citation
there are quite a few negative values however - indicating that a more recent family is being cited by an older family 
This may happen because it is not the families by themselves are not cited. 
It is the patent applications that belonged to the family that are cited. 
Families typically contain several applications that may have been filed in different years. 
These applications are then cited by other applications or publications. 
If family 1 has several applications filed between 2000 and 2005, and Family 2 applications filed between 2002 and 2010. 
it may be possible that the last application in Family 1 (filed in 2005) cites the first application of family 2 (filed in 2002)
so in the end it would look like Family 1 in 2000 cited Fam 2 which appeared in 2002. 
i.e., as if Family 1 could see in the future...!

A) we can calculate within X years citations, ignoring this problem and simply changing the negative values to 0 

B) Or we can go back to the citation data at the application level, provided in Table 212
for each pair of applications cited-citing, we can calculate the time to citations
and aggregate that at family level 
hopefully fewer negative values???

"""


def main():
    LOGGER.info("SCRIPT: a_cleancitations.py")
    # Calculate citations from application data
    df_appln = get_all_tls201info()
    aggregate_at_family_level(df_appln)
    get_citationpair_date()
    countwithinXyearscitations_and_aggregateatfamilylevel()
    LOGGER.info("SCRIPT: The End")
    return


def get_all_tls201info():
    # get basic info about applications, their family id and dates
    cols = ['appln_id', 'appln_kind', 'appln_filing_date', 'earliest_filing_date', 'earliest_publn_date', 'docdb_family_id', 'nb_citing_docdb_fam']
    list_files = [f for f in os.listdir(PATHS.patstatglobal) if 'tls201' in f]
    df_appln = []
    for file in list_files:
        LOGGER.info('       {}'.format(file))
        for num, dfchunk in enumerate(pd.read_csv(PATHS.patstatglobal / file, chunksize=500000, usecols=cols)):
            df_appln.append(dfchunk)
    df_appln = pd.concat(df_appln)
    return df_appln


def aggregate_at_family_level(df_appln):
    df_fam = df_appln[['docdb_family_id', 'earliest_filing_date', 'nb_citing_docdb_fam']].drop_duplicates()
    df_fam.index = df_fam['docdb_family_id']
    df_fam['appln_count'] = df_appln.groupby('docdb_family_id')['appln_id'].count()
    df_appln['appln_filing_date'] = df_appln['appln_filing_date'].replace('9999-12-31', np.nan)
    df_appln['appln_filing_date'] = pd.to_datetime(df_appln['appln_filing_date'])
    df_fam['appln_filing_date_min'] = df_appln.groupby('docdb_family_id')['appln_filing_date'].min()
    df_fam['appln_filing_date_max'] = df_appln.groupby('docdb_family_id')['appln_filing_date'].max()
    df_fam['appln_kinds'] = df_appln.groupby('docdb_family_id')['appln_kind'].agg(lambda x: x.unique())
    df_fam['appln_kinds'] = df_fam['appln_kinds'].apply(lambda x: ','.join([k.strip() for k in x]))
    df_fam.to_csv(PATHS.citations / 'docdbid_applninfo.csv', index=False)


def get_citationpair_date():
    LOGGER.info("FTC: get_citationpair_date")
    # Start from citation data at publication/application level
    list_files = [f for f in os.listdir(PATHS.patstatglobal) if 'tls212' in f]
    cols = ['pat_publn_id', 'cited_pat_publn_id', 'cited_appln_id']
    for file in list_files:
        LOGGER.info('       {}'.format(file))
        df_pairs = pd.read_csv(PATHS.patstatglobal / file, usecols=cols)
        df_pairs = df_pairs.rename(columns={'pat_publn_id': 'citing_pat_publn_id'})
        df_pairs = get_applicationids(df_pairs)
        df_pairs = add_yearcitationhappened(df_pairs)
        df_pairs = df_pairs[['citing_docdb_family_id', 'citing_appln_filing_date', 'cited_docdb_family_id', 'cited_appln_filing_date']].drop_duplicates()
        df_pairs = df_pairs[df_pairs['citing_docdb_family_id'].notnull()]
        df_pairs = df_pairs[df_pairs['cited_docdb_family_id'] != 0]
        newfile = file.split('_')[-1].split('.')[0]
        df_pairs.to_csv(PATHS.citations / f'docdbid_citations_date_{newfile}.csv', index=False)


def get_applicationids(df_pairs):
    LOGGER.info("FTC: get_applicationids")
    # Add appln_id of the publications
    df_applnids = []
    for file in [f for f in os.listdir(PATHS.patstatglobal) if 'tls211' in f]:
        LOGGER.info('       {}'.format(file))
        cols = ['pat_publn_id', 'appln_id']
        for num, dfchunk in enumerate(pd.read_csv(PATHS.patstatglobal / file, chunksize=500000, usecols=cols)):
            dfchunk = dfchunk[dfchunk['pat_publn_id'].isin(df_pairs['citing_pat_publn_id']) | dfchunk['pat_publn_id'].isin(df_pairs['cited_pat_publn_id'])]
            df_applnids.append(dfchunk)
    df_applnids = pd.concat(df_applnids)
    df_pairs = df_pairs.merge(df_applnids, left_on='citing_pat_publn_id', right_on='pat_publn_id', how='left')
    df_pairs = df_pairs.rename(columns={'appln_id': 'citing_appln_id'}).drop(columns=['pat_publn_id'])
    df_pairs = df_pairs.merge(df_applnids, left_on='cited_pat_publn_id', right_on='pat_publn_id', how='left')
    df_pairs = df_pairs.rename(columns={'appln_id': 'cited_publn_appln_id'}).drop(columns=['pat_publn_id'])
    df_pairs = df_pairs[['citing_appln_id', 'citing_pat_publn_id', 'cited_pat_publn_id', 'cited_publn_appln_id', 'cited_appln_id']]
    df_pairs['cited_appln_id_combined'] = df_pairs.apply(lambda row: row['cited_publn_appln_id'] if row['cited_publn_appln_id'] > 0 else row['cited_appln_id'], axis=1)
    return df_pairs


def add_yearcitationhappened(df_pairs):
    LOGGER.info("FTC: add_yearcitationhappened")
    # Add year when the citing application was filed
    cols = ['appln_id', 'appln_filing_date', 'docdb_family_id']
    list_files = [f for f in os.listdir(PATHS.patstatglobal) if 'tls201' in f]
    df_filingdate = []
    for file in list_files:
        LOGGER.info('       {}'.format(file))
        for num, dfchunk in enumerate(pd.read_csv(PATHS.patstatglobal / file, chunksize=500000, usecols=cols)):
            dfchunk = dfchunk[dfchunk['appln_id'].isin(df_pairs['cited_appln_id_combined']) | dfchunk['appln_id'].isin(df_pairs['citing_appln_id'])]
            df_filingdate.append(dfchunk)
    df_filingdate = pd.concat(df_filingdate)
    df_pairs = df_filingdate.merge(df_pairs, left_on='appln_id', right_on='citing_appln_id', how='right')
    df_pairs = df_pairs.rename(columns={'docdb_family_id': 'citing_docdb_family_id'}).drop(columns=['appln_id'])
    df_pairs = df_pairs.rename(columns={'appln_filing_date': 'citing_appln_filing_date'})
    df_pairs = df_pairs.merge(df_filingdate, left_on='cited_appln_id_combined', right_on='appln_id', how='left')
    df_pairs = df_pairs.rename(columns={'docdb_family_id': 'cited_docdb_family_id'}).drop(columns=['appln_id'])
    df_pairs = df_pairs.rename(columns={'appln_filing_date': 'cited_appln_filing_date'})
    return df_pairs


def calculate_citations(df_pairs):
    # Calculate total nbr of citations, total nbr of citations within 2 years, 3 years, 4 years, 5 years
    # Cleaning the data variables
    df_pairs['cited_appln_filing_date'] = df_pairs['cited_appln_filing_date'].replace('9999-12-31', np.nan)
    df_pairs['cited_appln_filing_date'] = pd.to_datetime(df_pairs['cited_appln_filing_date'])
    df_pairs['citing_appln_filing_date'] = df_pairs['citing_appln_filing_date'].replace('9999-12-31', np.nan)
    df_pairs['citing_appln_filing_date'] = pd.to_datetime(df_pairs['citing_appln_filing_date'])
    df_pairs['lapse_days'] = df_pairs['citing_appln_filing_date'] - df_pairs['cited_appln_filing_date']
    df_pairs['lapse_years'] = df_pairs['lapse_days']/np.timedelta64(1, 'Y')
    desc = df_pairs['lapse_years'].describe()
    LOGGER.info(f'Descriptives lapse_years: \n{desc}')
    percent_withnegative = 100 * df_pairs[df_pairs['lapse_years'] < 0].shape[0] / df_pairs.shape[0]
    LOGGER.info(f'Percent of observations with negative values: {percent_withnegative} % ')
    df_pairs['lapse_years_rounded'] = round(df_pairs['lapse_years'])
    citations_distribution = df_pairs.groupby('lapse_years_rounded')['citing_docdb_family_id'].count().reset_index()
    citations_distribution.to_csv(PATHS.citations / 'docdbid_citationcountdistribution_fromapplndata.csv', index=False)
    LOGGER.info('Saved  docdbid_citationcountdistribution_fromapplndata.csv')
    df_citations = pd.DataFrame(df_pairs.groupby('cited_docdb_family_id')['citing_docdb_family_id'].nunique().rename('total'))
    for year in [1, 3, 4, 5, 10, 15]:
        df_citations[f'within_{year}yr'] = df_pairs[df_pairs['lapse_years'] < year].groupby('cited_docdb_family_id')['citing_docdb_family_id'].nunique()
    df_citations = df_citations.reset_index()
    df_citations = df_citations.fillna(0)
    df_citations.to_csv(PATHS.citations / 'docdbid_citationcounts_fromapplndata.csv', index=False)
    LOGGER.info('Saved  docdbid_citationcounts_fromapplndata.csv')
    return


def countwithinXyearscitations_and_aggregateatfamilylevel():
    LOGGER.info('Begin countwithinXyearscitations_and_aggregateatfamilylevel')
    # open the files containing info about cited-citing pairs
    # and calculate nbr of citations within 1, 3, 4, ... years
    list_files = sorted([f for f in os.listdir(PATHS.patstatglobal) if 'tls212' in f])
    df_pairs = []
    for file in list_files:
        LOGGER.info(f'File = {file}')
        newfile = file.split('_')[-1].split('.')[0]
        df_pairs_chunk = pd.read_csv( + f'/data/intermediate_data/b_extracting_from_patstat/docdbid_citations_date_{newfile}.csv')
        df_pairs.append(df_pairs_chunk)
    df_pairs = pd.concat(df_pairs)
    calculate_citations(df_pairs)



def get_citationpairs_fromfamilydata():
    list_files = [f for f in os.listdir(PATHS.patstatglobal) if 'tls228' in f]
    df_pairs = []
    for file in list_files:
        LOGGER.info('       {}'.format(file))
        df_pairs_chunk = pd.read_csv(PATHS.patstatglobal / file)
        df_pairs.append(df_pairs_chunk)
    df_pairs = pd.concat(df_pairs)
    df_pairs = df_pairs.rename(columns={'docdb_family_id': 'citing_docdb_family_id'})
    return df_pairs


def add_earliest_filing_date(df_pairs):
    # Import dataframe with EARLIEST FILING YEAR info
    colnames = ['docdb_family_id', 'earliest_filing_date', 'appln_filing_date_min', 'appln_filing_date_max']
    df_fam = pd.read_csv(PATHS.citations / 'docdbid_applninfo.csv', usecols=colnames)
    # Next add earliest filing years of docdb families
    df_pairs = df_pairs.merge(df_fam, left_on='cited_docdb_family_id', right_on='docdb_family_id')
    df_pairs = df_pairs.rename(columns={'earliest_filing_date': 'cited_earliest_filing_date',
                             'appln_filing_date_min': 'cited_appln_filing_date_min',
                             'appln_filing_date_max': 'cited_appln_filing_date_max'})
    df_pairs = df_pairs.drop(columns='docdb_family_id')
    df_pairs = df_pairs.merge(df_fam[['docdb_family_id', 'earliest_filing_date', 'appln_filing_date_min', 'appln_filing_date_max']], left_on='citing_docdb_family_id', right_on='docdb_family_id')
    df_pairs = df_pairs.rename(columns={'earliest_filing_date': 'citing_earliest_filing_date',
                             'appln_filing_date_min': 'citing_appln_filing_date_min',
                             'appln_filing_date_max': 'citing_appln_filing_date_max'})
    return df_pairs



def calculate_output_yearlapse(df_pairs):
    df_pairs['cited_earliest_filing_date'] = df_pairs['cited_earliest_filing_date'].replace('9999-12-31', np.nan)
    df_pairs['cited_earliest_filing_date'] = pd.to_datetime(df_pairs['cited_earliest_filing_date'])
    df_pairs['citing_earliest_filing_date'] = df_pairs['citing_earliest_filing_date'].replace('9999-12-31', np.nan)
    df_pairs['citing_earliest_filing_date'] = pd.to_datetime(df_pairs['citing_earliest_filing_date'])
    df_pairs['lapse_days'] = df_pairs['citing_earliest_filing_date'] - df_pairs['cited_earliest_filing_date']
    df_pairs['lapse_years'] = df_pairs['lapse_days'] / np.timedelta64(1, 'Y')
    desc = df_pairs['lapse_years'].describe()
    LOGGER.info(f'Descriptives lapse_years: \n{desc}')
    percent_withnegative = 100 * df_pairs[df_pairs['lapse_years'] < 0].shape[0] / df_pairs.shape[0]
    LOGGER.info(f'Percent of observations with negative values: {percent_withnegative} % ')
    df_pairs['lapse_years_rounded'] = round(df_pairs['lapse_years'])
    # count nbr of pairs that have a lapse of duration X
    citations_distribution = df_pairs.groupby('lapse_years_rounded')['citing_docdb_family_id'].count().reset_index()
    citations_distribution.to_csv(PATHS.citations / 'docdbid_citationcountdistribution_fromfamilydata.csv', index=False)
    LOGGER.info('Saved  docdbid_citationcountdistribution_fromfamilydata.csv')
    df_citations = pd.DataFrame(df_pairs.groupby('cited_docdb_family_id').nunique()['citing_docdb_family_id'].rename('total'))
    for year in [1, 3, 4, 5, 10, 15]:
        df_citations[f'within_{year}yr'] = df_pairs[df_pairs['lapse_years'] < year].groupby('cited_docdb_family_id')['citing_docdb_family_id'].nunique()
    df_citations = df_citations.reset_index()
    df_citations = df_citations.fillna(0)
    df_citations.to_csv(PATHS.citations / 'docdbid_citationcounts_fromfamilydata.csv', index=False)
    LOGGER.info('Saved  docdbid_citationcounts_fromfamilydata.csv')
    return df_pairs




def output_citationcounts_fromfamilydata():
    LOGGER.info('Begin  output_citationcounts_fromfamilydata')
    df_pairs = get_citationpairs_fromfamilydata()
    df_pairs = add_earliest_filing_date(df_pairs)
    calculate_output_yearlapse(df_pairs)
    return




def combine_citationcounts():
    df_citations_app = pd.read_csv( + f'/data/intermediate_data/b_extracting_from_patstat/docdbid_citationcounts_fromapplndata.csv')
    df_citations_app.index = df_citations_app['cited_docdb_family_id']
    del df_citations_app['cited_docdb_family_id']
    df_citations_app.columns = [k + '_app' for k in df_citations_app.columns]
    df_citations_fam = pd.read_csv( + f'/data/intermediate_data/b_extracting_from_patstat/docdbid_citationcounts_fromfamilydata.csv')
    df_citations_fam.index = df_citations_fam['cited_docdb_family_id']
    del df_citations_fam['cited_docdb_family_id']
    df_citations_fam.columns = [k + '_fam' for k in df_citations_fam.columns]
    df_citations = df_citations_app.merge(df_citations_fam, on='cited_docdb_family_id', how='outer', indicator=True)
    df_citations = df_citations.reset_index()
    df_citations['total_diff'] = df_citations['total_app'] - df_citations['total_fam']
    df_citations['within_3yr_diff'] = df_citations['within_3yr_app'] - df_citations['within_3yr_fam']
    return df_citations


def add_nbrcitationsfromtls201(df_citations):
    df_fam = pd.read_csv( + '/data/intermediate_data/b_extracting_from_patstat/docdbid_applninfo.csv', usecols=['docdb_family_id', 'nb_citing_docdb_fam'])
    df_citations = df_citations.merge(df_fam, left_on='cited_docdb_family_id', right_on='docdb_family_id', how='left')
    return df_citations


def comparing_counts():
    # Check citation counts calculated from application info vs family info
    df_citations = combine_citationcounts()
    """
    Very different. First, it seems that tls 228, with family info, is quite incomplete. not all families are in there.
    Second, the citation counts built from application level info is the same or higher than the one built from family info 
    for 50% of observations it is the same. for 90% it is very close. 
    """
    Nobs = df_citations[df_citations['total_diff'].notnull()].shape[0]
    mask = df_citations['total_diff'] == 0
    percent = round(100 * df_citations[mask].shape[0] / Nobs)
    LOGGER.info(f'Percentage of observations with same citation counts = {percent} %')
    mask = df_citations['total_diff'] <= 3
    percent = round(100 * df_citations[mask].shape[0] / Nobs)
    LOGGER.info(f'Percentage of observations with close citation counts (+0 to +3) = {percent} %')
    mask = df_citations['total_diff'] <= 10
    percent = round(100 * df_citations[mask].shape[0] / Nobs)
    LOGGER.info(f'Percentage of observations with close citation counts (+ 0 to +10) = {percent} %')
    comparing_to_countsfromtsl201(df_citations)
    comparing_to_previouscounts(df_citations)


def comparing_to_countsfromtsl201(df_citations):
    #  Which one is clsoe to the total nbr of citations given in tls 201?
    df_citations = add_nbrcitationsfromtls201(df_citations)
    df_citations['diff_app'] = df_citations['total_app'] - df_citations['nb_citing_docdb_fam']
    df_citations['diff_fam'] = df_citations['total_fam'] - df_citations['nb_citing_docdb_fam']
    df_citations['diff_app'].describe()
    df_citations['diff_fam'].describe()
    #  counts calculated from application level data are much closer to what is shown in tls201.
    # the counts from family info is on average 3 units smaller than (and sometimes 4000 units smaller!)


def add_old_counts(df_citations):
    colnames = ['cited_docdb_family_id', 'sum(nb_citing_docdb_fam_within_3_year)', 'sum(nbr_citing_docdb_fam_tot)']
    df_citations_old = pd.read_csv( + '/derived_from_patstat/countCitedByYear.csv', usecols=colnames)
    df_citations = df_citations.merge(df_citations_old, on='cited_docdb_family_id', how='left')
    return df_citations


def comparing_to_previouscounts(df_citations):
    df_citations = add_old_counts(df_citations)
    df_citations['diff_app'] = df_citations['within_3yr_app'] - df_citations['sum(nb_citing_docdb_fam_within_3_year)']
    df_citations['diff_fam'] = df_citations['within_3yr_fam'] - df_citations['sum(nb_citing_docdb_fam_within_3_year)']
    df_citations['diff_app'].describe()
    df_citations['diff_fam'].describe()
    """
    old vs fam based: mean -0.1   [-1000 to 360]
    old vs app based: mean 0.2 [-1000 to 800]
    """
    Nobs = df_citations[df_citations['diff_app'].notnull()].shape[0]
    mask = df_citations['diff_app'] == 0
    percent = round(100 * df_citations[mask].shape[0] / Nobs)
    LOGGER.info(f'Percentage of observations with same citation counts = {percent} %')
    mask = df_citations['diff_app'] <= 3
    percent = round(100 * df_citations[mask].shape[0] / Nobs)
    LOGGER.info(f'Percentage of observations with close citation counts (+0 to +3) = {percent} %')
    mask = df_citations['diff_app'] <= 10
    percent = round(100 * df_citations[mask].shape[0] / Nobs)
    LOGGER.info(f'Percentage of observations with close citation counts (+ 0 to +10) = {percent} %')





